import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
##Import any other packages you may need here
# install packages
import sys
!{sys.executable} -m pip install plotly # you may want to restart your kernel if it can't find the plotly package
# after installing it
!{sys.executable} -m pip install "notebook>=5.3"
!{sys.executable} -m pip install "ipywidgets>=7.5"
# load packages
from IPython.display import display, Markdown, clear_output
# set pandas
pd.set_option('display.max_columns', None)
import plotly.express as px
import plotly.graph_objs as go
import seaborn as sns
from collections import Counter
from scipy.stats import chi2
from scipy import stats
import pydicom
# clear stuff
clear_output()
EDA is open-ended, and it is up to you to decide how to look at different ways to slice and dice your data. A good starting point is to look at the requirements for the FDA documentation in the final part of this project to guide (some) of the analyses you do.
This EDA should also help to inform you of how pneumonia looks in the wild. E.g. what other types of diseases it's commonly found with, how often it is found, what ages it affects, etc.
Note that this NIH dataset was not specifically acquired for pneumonia. So, while this is a representation of 'pneumonia in the wild,' the prevalence of pneumonia may be different if you were to take only chest x-rays that were acquired in an ER setting with suspicion of pneumonia.
Perform the following EDA:
Note: use the full NIH data to perform the first few EDA items and use sample_labels.csv for the pixel-level assessments.
Also, describe your findings and how you will set up the model training based on the findings.
## Helper code that reads the two metadata tables used in this notebook.

# Full NIH metadata table: read it and preview three random rows.
nih_metadata_path = '/data/Data_Entry_2017.csv'
all_xray_df = pd.read_csv(nih_metadata_path)
display(all_xray_df.sample(3))

# 'sample_labels.csv' is the table used for the pixel level assessments;
# the trailing expression previews three random rows in the notebook.
sample_labels_path = 'sample_labels.csv'
sample_df = pd.read_csv(sample_labels_path)
sample_df.sample(3)
# ---------------------------------------------------------------------------
# Clean-up: drop the stray 'Unnamed: 11' column when it is present.
# ---------------------------------------------------------------------------
if 'Unnamed: 11' in all_xray_df.columns:
    all_xray_df = all_xray_df.drop(columns='Unnamed: 11')

# ---------------------------------------------------------------------------
# One-hot encode the findings.
# The isinstance guard makes the cell safe to re-run: the '|' separated
# string is only split into a list the first time.
# ---------------------------------------------------------------------------
first_entry = all_xray_df['Finding Labels'].values[0]
if isinstance(first_entry, str):
    all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].str.split('|')

# Flat series holding every individual label occurrence.
FINDING_LABELS = pd.Series(np.concatenate(all_xray_df['Finding Labels']).ravel())

# One 0/1 indicator column per distinct label.
for label in FINDING_LABELS.value_counts().index:
    all_xray_df[label] = [int(label in findings) for findings in all_xray_df['Finding Labels']]

# ---------------------------------------------------------------------------
# Readable class column derived from the 'Pneumonia' indicator.
# ---------------------------------------------------------------------------
all_xray_df['Pneumonia_Class'] = np.where(all_xray_df['Pneumonia'] == 1,
                                          'Has_Pneumonia',
                                          'Has_No_Pneumonia')
all_xray_df.info()
# EDA of a categorical column: general characteristics, value counts,
# distribution per pneumonia class and a chi² test of independence.
column = 'Patient Gender'
alpha = 0.05  # significance level used for the chi² test below

# --- general info -----------------------------------------------------------
text = f'## {column} \n'
text += f'### General characteristics: \n'
text += f'<b>Amount of unique values\'s</b>: {all_xray_df[column].nunique()} \n'
text += f'<b>Amount of NaNs</b>: {all_xray_df[column].isnull().sum()} \n'
display(Markdown(text))

# --- value counts of the items ----------------------------------------------
value_counts = all_xray_df[column].value_counts()
text = '<b>Value Counts:</b> \n'
v_sum = len(all_xray_df[column])
for k, v in value_counts.items():
    text += f'<b>{k}</b>: {v} - ({np.around(v/v_sum*100,2)})% \n'
display(Markdown(text))

# --- distribution -----------------------------------------------------------
# The column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x and pandas 2.x.
pie_df = value_counts.rename_axis('index').reset_index(name='count')
display(px.pie(pie_df, values='count', names='index', color='index'))

text = f'## {column} & Pneumonia \n'
display(Markdown(text))

# --- distribution by pneumonia ----------------------------------------------
observed = pd.crosstab(all_xray_df[column], all_xray_df['Pneumonia_Class'])
class_columns = list(observed.columns)
pd_cross_tab = observed.copy()
pd_cross_tab['Total'] = pd_cross_tab.sum(axis=1)
display(pd_cross_tab)

# Grouped bar plot of the class counts (the 'Total' column is left out).
pd_cross_tab = pd_cross_tab.sort_index(ascending=False)
display(px.bar(pd.melt(pd_cross_tab.reset_index(),
                       id_vars=[column],
                       value_vars=class_columns,
                       var_name='Pneumonia_Class'),
               x='Pneumonia_Class',
               y='value',
               color=column,
               barmode='group'))

# --- chi² test --------------------------------------------------------------
text = f'## Chi²-Test \n'
display(Markdown(text))
observed = observed.sort_index(ascending=False)

# Expected count per cell under independence: row total * column total / N.
# The totals are turned into NumPy arrays first, because multi-dimensional
# indexing on a Series (series[:, None]) is rejected by pandas >= 2.0.
row_totals = observed.sum(axis=1).values
col_totals = observed.sum(axis=0).values
expected_values = np.outer(row_totals, col_totals) / row_totals.sum()

# chi² statistic, degrees of freedom, critical value and p-value
chi_square = ((observed.values - expected_values) ** 2 / expected_values).sum()
degrees_of_freedom = (expected_values.shape[0] - 1) * (expected_values.shape[1] - 1)
critical_value = chi2.isf(alpha, degrees_of_freedom)
p_value = chi2.sf(chi_square, degrees_of_freedom)

# Table with the observed counts next to the expected counts.
cross_tab = observed.copy()
for e, col in enumerate(observed.columns):
    cross_tab[f'Expected {col}'] = expected_values[:, e]

# show result
display(np.around(cross_tab, 2))
text = f'<b> Chi² value</b>: {np.around(chi_square,2)} \n'
# 0.05 is the significance level, not the p-value; both are reported now.
text += f'<b> Significance level (alpha)</b>: {alpha} \n'
text += f'<b> P-value</b>: {p_value:.4g} \n'
text += f'<b> Degrees of freedom</b>: {degrees_of_freedom} \n'
display(Markdown(text))
if chi_square < critical_value:
    text = f'<span style="color:#009900; font-weight:900"> The chi² value of {np.around(chi_square, 2)} is < the critical value of {np.around(critical_value,2)}</span>: \n'
    text += f'Therefore we won\'t reject the nullhypothesis which means that there are no significan differences between both groups'
else:
    text = f'<span style="color:#990000; font-weight:900"> The chi² value of {np.around(chi_square, 2)} is > the critical value of {np.around(critical_value,2)}</span>: \n'
    text += f'Therefore we will reject the nullhypothesis which means that there is a significan differences between both groups'
display(Markdown(text))
Based on the above information we notice that there are more males in our dataset than females,
but based on the chi² test, gender does not have a significant influence on Pneumonia
# EDA of the age column: general characteristics, summary statistics,
# Welch t-test between both pneumonia classes and a chi² test on 5-year bins.
column = 'Patient Age'
alpha = 0.05          # significance level used for the chi² test below
max_valid_age = 120   # ages above this value are treated as outliers
bin_width = 5         # width of the age bins used for the chi² test

text = f'### {column} \n'
text += f'#### General characteristics: \n'
text += f'<b>Amount of unique values\'s</b>: {all_xray_df[column].nunique()} \n'
text += f'<b>Amount of NaNs</b>: {all_xray_df[column].isnull().sum()} \n'
text += f'<b>Amount of outliers</b>: {(all_xray_df[column] > max_valid_age).sum()} \n'
display(Markdown(text))

text = '<span style="color:#ff0000; font-weight:900"> Note that I\'ve removed the outliers for the next stats!</span> \n'
# Keep exactly the rows that were NOT counted as outliers above
# (the original filter `< 120` silently dropped an age of exactly 120).
all_xray_df_without_outliers = all_xray_df[all_xray_df[column] <= max_valid_age]
for k, v in all_xray_df_without_outliers[column].describe().astype(np.int32).items():
    text += f'<b>{k}</b>: {v} \n'
display(Markdown(text))

#
# student t-test
#
text = f'## T-Test \n'
display(Markdown(text))
display(np.around(all_xray_df_without_outliers.groupby('Pneumonia_Class').agg({column: ['min',
                                                                                         'mean',
                                                                                         'max',
                                                                                         'std',
                                                                                         "median",
                                                                                         'size']}).T, 2)
        )
display(px.histogram(all_xray_df_without_outliers,
                     x=column,
                     color='Pneumonia_Class',
                     nbins=100,
                     log_y=True,
                     barmode='overlay',
                     title='Log - Histogram',
                     opacity=0.75))
display(Markdown('<b>Result t-test</b>'))
# equal_var=False selects Welch's t-test (no equal-variance assumption).
is_pneumonia = all_xray_df_without_outliers.Pneumonia_Class == 'Has_Pneumonia'
is_no_pneumonia = all_xray_df_without_outliers.Pneumonia_Class == 'Has_No_Pneumonia'
display(stats.ttest_ind(all_xray_df_without_outliers.loc[is_no_pneumonia, column].values,
                        all_xray_df_without_outliers.loc[is_pneumonia, column].values,
                        equal_var=False))

#
# chi² test on binned ages
#
text = f'## Chi²-Test \n'
display(Markdown(text))
all_xray_df_without_outliers = all_xray_df_without_outliers.sort_values(column).copy()

# The bin edges extend one bin past max_valid_age so that every kept age has
# both a lower and an upper edge (edges ending at 100 raised an IndexError
# for any age >= 100).
# NOTE(review): negative ages are not handled - confirm they cannot occur.
bins_column = f'{column}_bins'
bins = np.arange(0, max_valid_age + bin_width + 1, bin_width)
inds = np.digitize(all_xray_df_without_outliers[column], bins)
all_xray_df_without_outliers[bins_column] = [f'[{low} - {high}['
                                             for low, high in zip(bins[inds - 1], bins[inds])]
observed = pd.crosstab(all_xray_df_without_outliers[bins_column],
                       all_xray_df_without_outliers['Pneumonia_Class'])

# Expected count per cell under independence: row total * column total / N.
# The totals are turned into NumPy arrays first, because multi-dimensional
# indexing on a Series (series[:, None]) is rejected by pandas >= 2.0.
row_totals = observed.sum(axis=1).values
col_totals = observed.sum(axis=0).values
expected_values = np.outer(row_totals, col_totals) / row_totals.sum()

# chi² statistic, degrees of freedom, critical value and p-value
chi_square = ((observed.values - expected_values) ** 2 / expected_values).sum()
degrees_of_freedom = (expected_values.shape[0] - 1) * (expected_values.shape[1] - 1)
critical_value = chi2.isf(alpha, degrees_of_freedom)
p_value = chi2.sf(chi_square, degrees_of_freedom)

# Table with the observed counts next to the expected counts.
cross_tab = observed.copy()
for e, col in enumerate(observed.columns):
    cross_tab[f'Expected {col}'] = expected_values[:, e]

# show result
display(np.around(cross_tab, 2))

# The bin labels sort alphabetically ('[10 - 15[' before '[5 - 10['), so the
# numeric lower edge is parsed out of the label to order the bars.
ct_i = cross_tab.reset_index()
ct_i['i'] = [int(x.split(' ', 1)[0][1:]) for x in ct_i[bins_column]]
ct_i = ct_i.sort_values('i')
ct_i[bins_column] = ct_i[bins_column].astype(str)
display(px.bar(pd.melt(frame=ct_i,
                       id_vars=[bins_column],
                       value_vars=['Has_Pneumonia', 'Expected Has_Pneumonia'],
                       var_name='Pneumonia_Class'),
               x=bins_column,
               y='value',
               color='Pneumonia_Class',
               barmode='group',
               title='Histogram',
               opacity=0.75))

text = f'<b> Chi² value</b>: {np.around(chi_square,2)} \n'
# 0.05 is the significance level, not the p-value; both are reported now.
text += f'<b> Significance level (alpha)</b>: {alpha} \n'
text += f'<b> P-value</b>: {p_value:.4g} \n'
text += f'<b> Degrees of freedom</b>: {degrees_of_freedom} \n'
display(Markdown(text))
if chi_square < critical_value:
    text = f'<span style="color:#009900; font-weight:900"> The chi² value of {np.around(chi_square, 2)} is < the critical value of {np.around(critical_value,2)}</span>: \n'
    text += f'Therefore we won\'t reject the nullhypothesis which means that there are no significan differences between both groups'
else:
    text = f'<span style="color:#990000; font-weight:900"> The chi² value of {np.around(chi_square, 2)} is > the critical value of {np.around(critical_value,2)}</span>: \n'
    text += f'Therefore we will reject the nullhypothesis which means that there is a significan differences between both groups'
display(Markdown(text))
According to the above tests, we can conclude that the age distributions of the Has Pneumonia and Has No Pneumonia groups are different.
This means that Age might have an influence on Pneumonia
# EDA of a categorical column: general characteristics, value counts,
# distribution per pneumonia class and a chi² test of independence.
column = 'View Position'
alpha = 0.05  # significance level used for the chi² test below

# --- general info -----------------------------------------------------------
text = f'## {column} \n'
text += f'### General characteristics: \n'
text += f'<b>Amount of unique values\'s</b>: {all_xray_df[column].nunique()} \n'
text += f'<b>Amount of NaNs</b>: {all_xray_df[column].isnull().sum()} \n'
display(Markdown(text))

# --- value counts of the items ----------------------------------------------
value_counts = all_xray_df[column].value_counts()
text = '<b>Value Counts:</b> \n'
v_sum = len(all_xray_df)
for k, v in value_counts.items():
    text += f'<b>{k}</b>: {v} - {np.around(v/v_sum*100,2)}% \n'
display(Markdown(text))

# --- distribution -----------------------------------------------------------
# The column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x and pandas 2.x.
pie_df = value_counts.rename_axis('index').reset_index(name='count')
display(px.pie(pie_df, values='count', names='index', color='index'))

text = f'## {column} & Pneumonia \n'
display(Markdown(text))

# --- distribution by pneumonia ----------------------------------------------
observed = pd.crosstab(all_xray_df[column], all_xray_df['Pneumonia_Class'])
class_columns = list(observed.columns)
pd_cross_tab = observed.copy()
pd_cross_tab['Total'] = pd_cross_tab.sum(axis=1)
display(pd_cross_tab)

# Grouped bar plot of the class counts (the 'Total' column is left out).
pd_cross_tab = pd_cross_tab.sort_index(ascending=False)
display(px.bar(pd.melt(pd_cross_tab.reset_index(),
                       id_vars=[column],
                       value_vars=class_columns,
                       var_name='Pneumonia_Class'),
               x='Pneumonia_Class',
               y='value',
               color=column,
               barmode='group'))

# --- chi² test --------------------------------------------------------------
text = f'## Chi²-Test \n'
display(Markdown(text))
observed = observed.sort_index(ascending=False)

# Expected count per cell under independence: row total * column total / N.
# The totals are turned into NumPy arrays first, because multi-dimensional
# indexing on a Series (series[:, None]) is rejected by pandas >= 2.0.
row_totals = observed.sum(axis=1).values
col_totals = observed.sum(axis=0).values
expected_values = np.outer(row_totals, col_totals) / row_totals.sum()

# chi² statistic, degrees of freedom, critical value and p-value
chi_square = ((observed.values - expected_values) ** 2 / expected_values).sum()
degrees_of_freedom = (expected_values.shape[0] - 1) * (expected_values.shape[1] - 1)
critical_value = chi2.isf(alpha, degrees_of_freedom)
p_value = chi2.sf(chi_square, degrees_of_freedom)

# Table with the observed counts next to the expected counts.
cross_tab = observed.copy()
for e, col in enumerate(observed.columns):
    cross_tab[f'Expected {col}'] = expected_values[:, e]

# show result
display(np.around(cross_tab, 2))
text = f'<b> Chi² value</b>: {np.around(chi_square,2)} \n'
# 0.05 is the significance level, not the p-value; both are reported now.
text += f'<b> Significance level (alpha)</b>: {alpha} \n'
text += f'<b> P-value</b>: {p_value:.4g} \n'
text += f'<b> Degrees of freedom</b>: {degrees_of_freedom} \n'
display(Markdown(text))
if chi_square < critical_value:
    text = f'<span style="color:#009900; font-weight:900"> The chi² value of {np.around(chi_square, 2)} is < the critical value of {np.around(critical_value,2)}</span>: \n'
    text += f'Therefore we won\'t reject the nullhypothesis which means that there are no significan differences between both groups'
else:
    text = f'<span style="color:#990000; font-weight:900"> The chi² value of {np.around(chi_square, 2)} is > the critical value of {np.around(critical_value,2)}</span>: \n'
    text += f'Therefore we will reject the nullhypothesis which means that there is a significan differences between both groups'
display(Markdown(text))
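Conclusion: In the above stats, we can see that the distributions for the two view positions, PA (posterior-anterior) and AP (anterior-posterior), are significantly different. Note that PA and AP describe the direction in which the X-ray beam passes through the chest, not the side the patient lies on. It seems that it might be easier to detect Pneumonia in AP images.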
# EDA of the finding labels: general characteristics, value counts,
# counts per pneumonia class and a chi² test of independence.
column = 'Finding Labels'
alpha = 0.05  # significance level used for the chi² test below

# Long format: one row per (image, label) pair for which the label is set.
label_order = list(FINDING_LABELS.value_counts().index)
df_labels = pd.melt(all_xray_df, id_vars=['Pneumonia_Class'], value_vars=label_order)
df_labels = df_labels[df_labels.value > 0]

# --- general info -----------------------------------------------------------
text = f'## {column} \n'
text += f'### General characteristics: \n'
text += f'<b>Amount of unique values\'s</b>: {df_labels["variable"].nunique()} \n'
text += f'<b>Amount of NaNs</b>: {df_labels["variable"].isnull().sum()} \n'
display(Markdown(text))

# --- value counts of the items ----------------------------------------------
label_counts = df_labels["variable"].value_counts()
text = '<b>Value Counts:</b> \n'
v_sum = len(df_labels["variable"])
for k, v in label_counts.items():
    text += f'<b>{k}</b>: {v} - {np.around((v/v_sum)*100,2)}% \n'
display(Markdown(text))

# --- distribution -----------------------------------------------------------
# The column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x and pandas 2.x.
pie_df = label_counts.rename_axis('index').reset_index(name='count')
display(px.pie(pie_df, values='count', names='index', color='index'))

#
# Labels and pneumonia
#
text = f'## Labels & Pneumonia \n'
display(Markdown(text))

# --- distribution by pneumonia (one row per label) --------------------------
label_by_class = pd.crosstab(df_labels.Pneumonia_Class, df_labels.variable)[label_order].T
pd_cross_tab = label_by_class.copy()
pd_cross_tab['Total'] = pd_cross_tab.sum(axis=1)
display(pd_cross_tab)

# --- chi² test --------------------------------------------------------------
text = f'## Chi²-Test \n<b>Note</b>, we\'ve remove the <i>Pneumonia</i> and <i>No Finding</i> label'
display(Markdown(text))
observed = label_by_class[~label_by_class.index.isin(['No Finding', 'Pneumonia'])]

# Expected count per cell under independence: row total * column total / N.
# The totals are turned into NumPy arrays first, because multi-dimensional
# indexing on a Series (series[:, None]) is rejected by pandas >= 2.0.
# NOTE(review): an image can carry several labels, so the rows are not
# independent observations - treat this test as indicative only.
row_totals = observed.sum(axis=1).values
col_totals = observed.sum(axis=0).values
expected_values = np.outer(row_totals, col_totals) / row_totals.sum()

# chi² statistic, degrees of freedom, critical value and p-value
chi_square = ((observed.values - expected_values) ** 2 / expected_values).sum()
degrees_of_freedom = (expected_values.shape[0] - 1) * (expected_values.shape[1] - 1)
critical_value = chi2.isf(alpha, degrees_of_freedom)
p_value = chi2.sf(chi_square, degrees_of_freedom)

# Table with the observed counts next to the expected counts.
cross_tab = observed.copy()
for e, col in enumerate(observed.columns):
    cross_tab[f'Expected {col}'] = expected_values[:, e]

# show result
display(np.around(cross_tab, 2))
text = f'<b> Chi² value</b>: {np.around(chi_square,2)} \n'
# 0.05 is the significance level, not the p-value; both are reported now.
text += f'<b> Significance level (alpha)</b>: {alpha} \n'
text += f'<b> P-value</b>: {p_value:.4g} \n'
text += f'<b> Degrees of freedom</b>: {degrees_of_freedom} \n'
display(Markdown(text))
if chi_square < critical_value:
    text = f'<span style="color:#009900; font-weight:900"> The chi² value of {np.around(chi_square, 2)} is < the critical value of {np.around(critical_value,2)}</span>: \n'
    text += f'Therefore we won\'t reject the nullhypothesis which means that there are no significan differences between both groups'
else:
    text = f'<span style="color:#990000; font-weight:900"> The chi² value of {np.around(chi_square, 2)} is > the critical value of {np.around(critical_value,2)}</span>: \n'
    text += f'Therefore we will reject the nullhypothesis which means that there is a significan differences between both groups'
display(Markdown(text))
Conclusion: There are some labels which are highly correlated with Pneumonia:
+ Both distributions are different
# Class balance of the pneumonia label, followed by the number of distinct
# diseases found per patient.

# The column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas 1.x and pandas 2.x.
class_counts = (all_xray_df.Pneumonia_Class.value_counts()
                .rename_axis('index')
                .reset_index(name='count'))
display(class_counts)
display(px.pie(class_counts,
               values='count',
               names='index',
               color='index'))

labels = FINDING_LABELS.value_counts().index

# Per patient: a disease counts once if it shows up in any of the patient's
# images ('max' over the 0/1 indicators); 'No Finding' is not a disease.
n_patients = all_xray_df['Patient ID'].nunique()
disease_columns = [c for c in labels if c not in ['No Finding']]
diseases_per_patient = all_xray_df.groupby('Patient ID')[disease_columns].max().sum(axis=1)
disease_count_distribution = diseases_per_patient.value_counts()

# .get avoids a KeyError when every patient has at least one disease.
amount_of_no_diseases = disease_count_distribution.get(0, 0)
amount_of_diseases = n_patients - amount_of_no_diseases

text = f"Amount of patients: {n_patients} \n"
text += f"Amount of patients without a disease: {amount_of_no_diseases} - {np.around(amount_of_no_diseases / n_patients * 100,2)}% \n"
text += f"Amount of patients with a deasease: {amount_of_diseases} - {np.around(amount_of_diseases / n_patients * 100,2)}% \n"
display(Markdown(text))

df_disease_per_patient = disease_count_distribution.reset_index()
df_disease_per_patient.columns = ['Amount of Diseases', 'Amount of Patients']
df_disease_per_patient['%Patients'] = np.around(df_disease_per_patient['Amount of Patients'] / df_disease_per_patient['Amount of Patients'].sum(axis=0)*100, 2)
display(df_disease_per_patient)

# NOTE(review): this pie repeats the label distribution from the previous
# cell (it reads df_labels) - confirm whether a pie of
# df_disease_per_patient was intended here.
label_pie_df = (df_labels["variable"].value_counts()
                .rename_axis('index')
                .reset_index(name='count'))
display(px.pie(label_pie_df, values='count', names='index', color='index'))
Note, if a patient has a disease, it is most likely that it is only 1 disease.
# Co-occurrence of the findings within the images that contain Pneumonia.
labels = list(FINDING_LABELS.value_counts().index)

# 0/1 indicator columns of the pneumonia images only.
pneumonia_rows = all_xray_df['Pneumonia_Class'] == 'Has_Pneumonia'
pneumonia_findings = all_xray_df.loc[pneumonia_rows, labels]

# Entry (a, b) of X^T . X counts the images carrying both label a and label b.
co_occurence_matrix = pneumonia_findings.T.dot(pneumonia_findings)

# Normalise by the number of pneumonia images and round to two decimals.
co_occurence_matrix = (co_occurence_matrix / pneumonia_findings.shape[0]).round(2)

# Render the matrix with a green background gradient.
cm = sns.light_palette("green", as_cmap=True)
s = co_occurence_matrix.style.background_gradient(cmap=cm)
s
Note, from the above stats, we see that Pneumonia frequently co-occurs with:
def plot_image_info(data, info, img=None, norm_modus = 0):
    """Plot an image next to the histogram of its pixel intensities.

    Parameters
    ----------
    data : mapping-like (e.g. a DataFrame row)
        Must be indexable by every name in ``info`` and, when ``img`` is
        None, by ``'IMAGE_PATH'``.
    info : iterable
        Keys of ``data`` whose values are joined with ' - ' into the title of
        the image panel.
    img : array-like, optional
        Pixel data to plot. When None, the image is read with ``plt.imread``
        from ``data['IMAGE_PATH']``.
    norm_modus : int, default 0
        0 standardises the image (subtract the mean, divide by the standard
        deviation); any other value divides the pixel values by 255.

    Returns
    -------
    None
        The figure is created through pyplot and left open for the notebook
        to render.
    """
    if img is None:
        img = plt.imread(data['IMAGE_PATH'])

    ## normalize image
    if norm_modus == 0:
        centered = img - img.mean()
        spread = img.std()
        # A constant image has zero spread; dividing would turn every pixel
        # into NaN and make the histogram below fail, so it is only centred.
        img = centered / spread if spread > 0 else centered
    else:
        # NOTE(review): assumes pixel values in the 0-255 range - confirm for
        # images that plt.imread already returns as floats.
        img = img / 255

    ## plot the image and its pixel intensity distribution
    fig = plt.figure()
    fig.set_figwidth(20)

    ## Plotting Image
    sub1 = fig.add_subplot(1, 2, 1)
    sub1.set_title(f'X-Ray - Patient-ID: ' + ' - '.join([str(data[x]) for x in info]))
    sub1.imshow(img, cmap='gray')

    ## Plotting Pixel Intensity
    sub2 = fig.add_subplot(1, 2, 2)
    sub2.set_title(f'Pixel Intensity')
    # Draw on the axes explicitly instead of relying on pyplot's current axes.
    sub2.hist(img.ravel(), bins=256)
# Read every DICOM file in the working directory, collect its metadata in a
# table and plot each image together with its pixel intensity histogram.
dcm_records = []
scans = []  # scans[i] belongs to dcm_records[i]; None when a file has no pixel data

# loop over the dicom data
for dcm_file in glob('*dcm'):
    dcm = pydicom.dcmread(dcm_file)

    # collect the metadata, skipping the UID elements and the raw pixel data
    info = {}
    pixels = None
    for key in dcm.keys():
        element = dcm[key]
        if ' UID' in element.name:
            continue
        if 'Pixel Data' == element.name:
            pixels = dcm.pixel_array
            continue
        info[element.name] = element.value

    # Exactly one entry per file in both lists keeps them aligned by position.
    dcm_records.append(info)
    scans.append(pixels)

# The index labels (0..n-1) survive the sort, so they still point at the
# matching entry in `scans`.
df_dcm = pd.DataFrame(dcm_records).sort_values('Patient ID')

text= f'## Dicom Data'
display(Markdown(text))
display(df_dcm)

text= f'## Dicom Images'
display(Markdown(text))
# Look each image up by the row's original index. Zipping `scans` (file
# order) with the sorted rows paired images with the wrong metadata.
for idx, row in df_dcm.iterrows():
    img = scans[idx]
    if img is None:
        # no pixel data in this file - nothing to plot
        continue
    plot_image_info(row,
                    info = ['Patient ID',"Patient's Sex", "Patient's Age", "Body Part Examined", 'Patient Position', 'Study Description'],
                    img = img,
                    norm_modus=0)
# ---------------------------------------------------------------------------
# One-hot encode the findings of the sample data.
# The isinstance guard makes the cell safe to re-run: the '|' separated
# string is only split into a list the first time.
# ---------------------------------------------------------------------------
first_entry = sample_df['Finding Labels'].values[0]
if isinstance(first_entry, str):
    sample_df['Finding Labels'] = sample_df['Finding Labels'].str.split('|')

# Every individual label occurrence, plus the array of distinct labels.
FINDING_LABELS = pd.Series(np.concatenate(sample_df['Finding Labels']).ravel())
FINDING_LABELS_U = np.unique(FINDING_LABELS)

# One 0/1 indicator column per distinct label.
for label in FINDING_LABELS.value_counts().index:
    sample_df[label] = [int(label in findings) for findings in sample_df['Finding Labels']]

# ---------------------------------------------------------------------------
# Readable class column derived from the 'Pneumonia' indicator.
# ---------------------------------------------------------------------------
sample_df['Pneumonia_Class'] = np.where(sample_df['Pneumonia'] == 1,
                                        'Has_Pneumonia',
                                        'Has_No_Pneumonia')

# ---------------------------------------------------------------------------
# Image paths: map each file name to its full path on disk. Rows whose
# image is not found on disk get None.
# ---------------------------------------------------------------------------
images_2_path = {}
for image_path in glob('/data/images_*/images/*'):
    file_name = image_path.rsplit('/', 1)[1]
    images_2_path[file_name] = image_path
sample_df['IMAGE_PATH'] = [images_2_path.get(image_index) for image_index in sample_df['Image Index']]

# ---------------------------------------------------------------------------
# Order the rows per patient and follow-up.
# ---------------------------------------------------------------------------
sample_df = sample_df.sort_values(['Patient ID', 'Follow-up #'])
sample_df.head()
# One 'No Finding' image per (gender, age) combination for a handful of ages.
text= f'### Age - Gender'
display(Markdown(text))
text= f'Images of patients with no disease'
display(Markdown(text))

# Rows whose only label is 'No Finding': the sum over all indicator columns
# equals the 'No Finding' indicator itself.
only_no_finding = sample_df[FINDING_LABELS_U].sum(axis=1) == sample_df['No Finding']
healthy_df = sample_df[only_no_finding]

selected_ages = ['010Y', '018Y', '030Y', '050Y', '080Y']
sample_df_tmp = (
    healthy_df
    # shuffle all rows
    .sample(healthy_df.shape[0])
    # keep the selected ages only
    .loc[lambda df: df['Patient Age'].isin(selected_ages)]
    # order by follow-up (ascending) and view position (descending) ...
    .sort_values(['Follow-up #', 'View Position'], ascending=[True, False])
    # ... and keep the first row of every gender/age combination
    .drop_duplicates(subset=['Patient Gender', 'Patient Age'])
    .sort_values(['Patient Age', 'Patient Gender'])
)

title_fields = ['Patient ID', 'Finding Labels', 'Follow-up #', 'Patient Age', 'Patient Gender', 'View Position']
for _, row in sample_df_tmp.iterrows():
    plot_image_info(row, info=title_fields, img=None, norm_modus=1)
# 'No Finding' images of 30-year-old patients that were imaged in more than
# one view position.
text= f'### View Position'
display(Markdown(text))

# Rows whose only label is 'No Finding'.
only_no_finding = sample_df[FINDING_LABELS_U].sum(axis=1) == sample_df['No Finding']
sample_df_tmp = sample_df[only_no_finding]

# Restrict to age '030Y' and keep one row per patient and view position.
is_thirty = sample_df_tmp['Patient Age'] == '030Y'
sample_df_tmp = sample_df_tmp[is_thirty].drop_duplicates(subset=['Patient ID', 'View Position'])

# Count the view positions per patient and keep the patients with several.
views_per_patient = sample_df_tmp.groupby(['Patient ID'])['View Position'].transform('count')
sample_df_tmp['View Position Amount'] = views_per_patient
sample_df_tmp = sample_df_tmp[sample_df_tmp['View Position Amount'] > 1]

title_fields = ['Patient ID', 'Finding Labels', 'Follow-up #', 'Patient Age', 'Patient Gender', 'View Position']
for _, row in sample_df_tmp.iterrows():
    plot_image_info(row, info=title_fields, img=None, norm_modus=0)
# One example image per finding: male patients aged 30-39, AP view, where
# the finding is the only label on the image.
text= f'### Different Diseases '
display(Markdown(text))

# Loop invariants: number of labels per image, the age codes and the title.
labels_per_image = sample_df[FINDING_LABELS_U].sum(axis=1)
ages_thirties = [f'0{i}Y' for i in range(30, 40)]
title_fields = ['Patient ID', 'Finding Labels', 'Follow-up #', 'Patient Age', 'Patient Gender', 'View Position']

for finding in FINDING_LABELS.value_counts().index:
    # Rows whose only label is `finding`.
    sample_df_tmp = sample_df[labels_per_image == sample_df[finding]]

    in_age_range = sample_df_tmp['Patient Age'].isin(ages_thirties)
    is_male = sample_df_tmp['Patient Gender'] == 'M'
    is_ap_view = sample_df_tmp['View Position'] == 'AP'
    sample_df_tmp = sample_df_tmp[in_age_range & is_male & is_ap_view]

    # Plot the row with the lowest follow-up number, if there is one.
    sample_df_tmp = sample_df_tmp.sort_values(['Follow-up #'])
    for _, row in sample_df_tmp.head(1).iterrows():
        plot_image_info(row, info=title_fields, img=None, norm_modus=1)
# Side-by-side examples: for every (age, gender) combination that occurs with
# more than one distinct label set, plot one AP image per label set.
text= f'### Pneumonia vs No Pneumonia'
display(Markdown(text))
text= f'Images & distributions of patients with different ages and gender'
display(Markdown(text))

# Rows whose only label is 'No Finding' or whose only label is 'Pneumonia'.
labels_per_image = sample_df[FINDING_LABELS_U].sum(axis=1)
only_no_finding = labels_per_image == sample_df['No Finding']
only_pneumonia = labels_per_image == sample_df['Pneumonia']
sample_df_tmp = sample_df[only_no_finding | only_pneumonia]

# AP views only, ordered by follow-up number.
sample_df_tmp = sample_df_tmp[sample_df_tmp['View Position'] == 'AP']
sample_df_tmp = sample_df_tmp.sort_values('Follow-up #')

# The label lists are turned into strings so they can be used as a key for
# drop_duplicates below.
sample_df_tmp['Finding Labels'] = sample_df_tmp['Finding Labels'].map(str)

# Shuffle, then keep one row per age / gender / label combination.
sample_df_tmp = sample_df_tmp.sample(len(sample_df_tmp))
sample_df_tmp = sample_df_tmp.drop_duplicates(subset=['Patient Age', 'Patient Gender', 'Finding Labels'])

# Keep the age/gender groups that still hold more than one row.
group_sizes = sample_df_tmp.groupby(['Patient Age', 'Patient Gender'])['Finding Labels'].transform('size')
sample_df_tmp['Amount'] = group_sizes
sample_df_tmp = sample_df_tmp[sample_df_tmp['Amount'] > 1]

sample_df_tmp = sample_df_tmp.sort_values('Patient Age')
sample_df_tmp = sample_df_tmp.sort_values(['Patient Age', 'Finding Labels', 'Patient Gender'],
                                          ascending=[True, False, True])

title_fields = ['Patient ID', 'Finding Labels', 'Follow-up #', 'Patient Age', 'Patient Gender', 'View Position']
for _, row in sample_df_tmp.iterrows():
    plot_image_info(row, info=title_fields, img=None, norm_modus=1)
From the above images and pixel intensities we can see that not all the images are crystal clear. But in general, healthy people should have a lower pixel intensity (fewer bright pixels) than non-healthy people.